Stat 230: Linear Models
Homework 4
Professor Ding
Lev Golod
## Simulation setup: covariate samplers whose distribution parameters come
## from a grid of parameter combinations (`combs`).
set.seed(753)
n <- 1e5

# Parameter grid: 3 * 3 * 3 * 3 = 81 combinations.
combs <- expand.grid(
  p1 = c(1, 2, 3),
  p2 = c(1, 3, 5),
  p3 = c(1, 2, 3),
  p4 = c(0.2, 0.5, 0.8)
)

# Single draws of each covariate, parameterized by b.
x1 <- function(b) rnorm(n = 1, sd = b)               # normal, sd = b
x2 <- function(b) runif(n = 1, min = -b, max = b)    # uniform on [-b, b]
x3 <- function(b) rchisq(n = 1, df = b)              # chi-square, df = b
x4 <- function(b) rbinom(n = 1, size = 1, prob = b)  # Bernoulli(b)

# One observation (x1, x2, x3, x4) for a length-4 parameter vector p.
# NB: draws happen left to right, so the RNG stream order is fixed.
xi <- function(p) {
  c(x1(as.numeric(p[1])), x2(as.numeric(p[2])),
    x3(as.numeric(p[3])), x4(as.numeric(p[4])))
}

# One observation using row j of the parameter grid.
sample_fn <- function(j) xi(as.numeric(combs[j, ]))
## Draw n grid-row indices for each of the three designs (columns 1:3).
## nrow(combs) == 81; using nrow() instead of the magic constant 81 keeps
## this correct if the parameter grid ever changes.
mysamples <- replicate(3, sample(nrow(combs), n, replace = TRUE))

## FORK cluster for parallel sampling.
## NOTE(review): `no_cores` is not defined in this chunk — presumably set
## earlier (e.g. no_cores <- parallel::detectCores() - 1); confirm.
cl <- makeCluster(no_cores, type='FORK')
## Build the design data frames X1, X2, X3 in parallel: for each column of
## mysamples, draw one observation per sampled grid row.
## The original built variable names via eval(parse(text = ...)); assign()
## does the same job without string-constructed code.
for (i in 1:3){
  tm <- system.time(
    Xi <- data.frame(matrix(unlist(parLapply(cl, mysamples[, i], sample_fn)),
                            ncol = 4, byrow = TRUE))
  )
  print(tm)
  names(Xi) <- tolower(names(Xi))  # default X1..X4 -> x1..x4
  assign(paste0('X', i), Xi)
}
## user system elapsed
## 0.141 0.015 2.296
## user system elapsed
## 0.090 0.011 2.106
## user system elapsed
## 0.094 0.009 2.110
stopCluster(cl)
## Error terms: homoskedastic (X1), heteroskedastic in x3 (X2),
## heteroskedastic in x3 and x4 (X3). E[e] = 5 since e ~ chisq(df = 5).
X1$e <- rchisq(n, 5)
X2$e <- rchisq(n, 5) * (1.6 + X2$x3)^0.5
X3$e <- rchisq(n, 5) * (2.5 + X3$x4)^0.5 * X3$x3^0.5
## NOTE(review): Xlist is captured *before* y is added below, so its copies
## have no y column — confirm that is intended (Xlist appears unused later).
Xlist <- list(X1=X1,X2=X2,X3=X3)
## Response: y = 1 + x1 + x2 + x3 + e (x4 is deliberately excluded from the
## data-generating process). get()/assign() replace eval(parse(text = ...)).
for (i in 1:3){
  nm <- paste0('X', i)
  Xi <- get(nm)
  Xi$y <- 1 + rowSums(Xi[, c('x1','x2','x3','e')])
  assign(nm, Xi)
}
## Sample sizes studied in each simulation round.
Ns <- as.list(c(25, 50, 100, 250, 500, 1000))

## Checkpoint the workspace so later parts can resume from here.
save.image('./data/q4_a.Rdata')
The HC3 Covariance Estimator seems favorable in terms of size: it tends to
have the lower type I error rate. However, it also has lower power at small
sample sizes, e.g. n = {25, 50}.
For small samples, HC0 has the best power. This suggests that HC3 is a more
conservative estimate of the covariance, compared to HC0. Once the sample
size increases to about 100, all of the tests have power close to 1. In the plots we also observe that size decreases and power increases as sample
size increases, which makes sense. The one exception is power for Beta 4.
Since x4 is included in the model, but not included in the data-generating
process for y, the truth is that B4 should be very close to 0. So it makes
sense that power does not increase with sample size for B4.
## Part B: homoskedastic design (X1). Draw B = 1000 subsamples at each
## sample size in Ns; indices are drawn without replacement from the n rows.
load('./data/q4_a.Rdata')
B <- 1000
cl <- makeCluster(no_cores, type='FORK')
# B index vectors per sample size, smallest size first (Ns is ascending,
# so rep(..., each = B) reproduces the original sort(rep(...)) ordering).
homo_samp_ind <- parLapply(
  cl,
  as.list(rep(unlist(Ns), each = B)),
  function(sz) sample(x = n, size = sz, replace = FALSE)
)
names(homo_samp_ind) <- lapply(homo_samp_ind, length)
homo_samples <- parLapply(cl, homo_samp_ind,
                          function(idx) X1[idx, ])
stopCluster(cl)

## Working model: x4 is included even though it is not in the DGP for y.
mylm <- function(dat) lm(y ~ x1 + x2 + x3 + x4, data = dat)

## "True" coefficients: the full-data (n = 1e5) fit serves as the benchmark.
true_coeff <- mylm(X1)$coefficients
print(true_coeff)
## (Intercept) x1 x2 x3 x4
## 6.04831 1.00172 0.99920 0.99763 -0.04884
# k = 1; dat = homo_samples[[2030]]; b <- 0
## Function to get 5 p-values for the two-sided test of H0: beta_k = b,
## using the classical OLS covariance matrix and the HC0-HC3 estimators.
##
## Args:
##   dat: data.frame with columns x1..x4, y, e (e is dropped before use).
##   k:   index of the slope under test (1..4, i.e. x1..x4).
##   b:   hypothesized value of beta_k.
## Returns: named numeric vector of 5 p-values (OLSCM, HC0, HC1, HC2, HC3).
pvals <- function(dat, k, b){
  fit <- mylm(dat)
  beta_k <- fit$coefficients[k+1]
  ## OLS covariance matrix, computed by hand.
  p_samp <- 5
  n_samp <- nrow(dat)
  dat$one <- 1  # explicit intercept column, placed last
  xmat <- as.matrix(select(dat, -c(y,e)))
  ## BUGFIX: all.equal() returns a character vector (not FALSE) on mismatch,
  ## so `!all.equal(...)` raised a type error instead of stopping; wrap in
  ## isTRUE() so the guard actually fires.
  if (!isTRUE(all.equal(colnames(xmat), c("x1","x2","x3","x4","one")))) stop('Knope!')
  xtx_inv <- solve(t(xmat) %*% xmat)
  sigma_hat_sq <- sum(fit$residuals^2)/(n_samp-p_samp)
  covOLS <- xtx_inv * sigma_hat_sq
  ## Absolute t statistics. With the intercept last in xmat, the x_k slope
  ## sits at position k of covOLS but at position k+1 of vcovHC(fit).
  test_stat_OLS <- abs(beta_k - b)/(covOLS[k,k]^0.5)
  hc_stats <- vapply(
    c('HC0','HC1','HC2','HC3'),
    function(tp) abs(beta_k - b)/vcovHC(fit, type=tp)[k+1,k+1]^0.5,
    numeric(1)
  )
  # NB: TWO SIDED HYPOTHESIS TEST
  my_pt <- function(test_stat){
    pt(test_stat, df = n_samp - p_samp, lower.tail = FALSE)*2
  }
  ps <- c(my_pt(test_stat_OLS), my_pt(hc_stats))
  names(ps) <- c('OLSCM','HC0','HC1','HC2','HC3')
  ps
}
# pvals(homo_samples[[30]], k=1, b=true_coeff[1+2])
####
#### EMPIRICAL SIZE
#### Null hypothesis is Beta = Beta* (from regression w/ the full data)
### Loop through the coefficients Beta1...Beta4 (j = 1:4)
for (j in 1:4){
  ## p-values for all samples: 1000 each for N = 25, 50, ..., 1000.
  cl <- makeCluster(no_cores, type='FORK')
  print(system.time(
    all_pvals_size <- parLapply(cl, homo_samples,
                                function(x) pvals(dat=x, k=j,
                                                  b=true_coeff[j+1]))
  ))
  stopCluster(cl)
  ## One row per sample, one column per covariance estimator.
  pvals_size_mat <- matrix(unlist(all_pvals_size),
                           ncol=length(all_pvals_size[[1]]),
                           byrow=TRUE)
  ## Reject when p < alpha = 0.05. (The original comment said 0.5 — typo.)
  reject_size_mat <- pvals_size_mat < 0.05
  reject_size_mat <- cbind(as.numeric(names(all_pvals_size)), reject_size_mat)
  colnames(reject_size_mat) <- c('n', names(all_pvals_size[[1]]))
  ## Empirical size = rejection rate per sample size, per estimator.
  size_mat <- data.frame(reject_size_mat) %>%
    group_by(n) %>%
    summarise(OLSCM=mean(OLSCM), HC0=mean(HC0), HC1=mean(HC1),
              HC2=mean(HC2), HC3=mean(HC3))
  size_mat <- gather(size_mat, type, reject_rate, c(OLSCM,HC0,HC1,HC2,HC3))
  ## Re-code n as 1..6 so the x axis is evenly spaced; label with real Ns.
  size_mat[,1] <- as.numeric(as.factor(unlist(size_mat[,1])))
  size_plt <- ggplot(size_mat, aes(x=n, y=reject_rate,
                                   colour = type, shape=type))+
    geom_point()+
    geom_line()+
    ggtitle(paste0('Question 4, Part B, Size, Beta', j))+
    theme_bw() +
    scale_x_continuous(breaks=unique(unlist(size_mat[,1])),
                       labels=unlist(Ns))
  print(size_plt)
}
## user system elapsed
## 0.068 0.186 26.557
## user system elapsed
## 0.067 0.184 27.782
## user system elapsed
## 0.070 0.208 27.392
## user system elapsed
## 0.067 0.196 26.592
###
### POWER
### Null hypothesis is Beta = 0
for (j in 1:4){
  ## p-values for all samples: 1000 each for N = 25, 50, ..., 1000.
  cl <- makeCluster(no_cores, type='FORK')
  print(system.time(
    all_pvals_power <- parLapply(cl, homo_samples,
                                 function(x) pvals(dat=x, k=j, b=0))
  ))
  stopCluster(cl)
  ## One row per sample, one column per covariance estimator.
  pvals_power_mat <- matrix(unlist(all_pvals_power),
                            ncol=length(all_pvals_power[[1]]),
                            byrow=TRUE)
  ## Reject when p < alpha = 0.05. (The original comment said 0.5 — typo.)
  reject_power_mat <- pvals_power_mat < 0.05
  reject_power_mat <- cbind(as.numeric(names(all_pvals_power)),
                            reject_power_mat)
  colnames(reject_power_mat) <- c('n', names(all_pvals_power[[1]]))
  ## Empirical power = rejection rate per sample size, per estimator.
  ## (Removed a dead apply(...) whose result was computed and discarded.)
  power_mat <- data.frame(reject_power_mat) %>%
    group_by(n) %>%
    summarise(OLSCM=mean(OLSCM), HC0=mean(HC0), HC1=mean(HC1),
              HC2=mean(HC2), HC3=mean(HC3))
  power_mat <- gather(power_mat, type, reject_rate,
                      c(OLSCM,HC0,HC1,HC2,HC3))
  ## Re-code n as 1..6 so the x axis is evenly spaced; label with real Ns.
  power_mat[,1] <- as.numeric(as.factor(unlist(power_mat[,1])))
  power_plt <- ggplot(power_mat, aes(x=n, y=reject_rate,
                                     colour = type, shape=type)) +
    geom_point()+
    geom_line()+
    ggtitle(paste0('Question 4, Part B, Power, Beta', j))+
    theme_bw() +
    scale_x_continuous(breaks=unique(unlist(power_mat[,1])),
                       labels=unlist(Ns))
  print(power_plt)
}
## user system elapsed
## 0.07 0.20 26.54
## user system elapsed
## 0.066 0.192 25.859
## user system elapsed
## 0.067 0.192 26.124
## user system elapsed
## 0.067 0.199 26.390
save.image('./data/q4_b.Rdata')
Regarding size, HC3 seems to be the best once again. It’s interesting to note
that the OLSCM completely fails when it comes to Beta 3. This makes sense,
since the error term depends on X3.
For small samples, HC0 has the best power. This suggests that HC3 is a more
conservative estimate of the covariance, compared to HC0. For the
homoskedastic data, all of the covariance estimators achieved power close to 1
once sample size grew to 100. For the Heteroskedastic case with X1 and X2,
that doesn’t happen until n=250. This suggests that heteroskedasticity lowers
our power in some situations.
## Part C: first heteroskedastic design (X2). Same subsampling scheme as
## Part B: B index sets per sample size, drawn without replacement.
load('./data/q4_b.Rdata')
# B <- 1000 (already present in the saved workspace)
cl <- makeCluster(no_cores, type='FORK')
# Ns is ascending, so rep(..., each = B) matches the original
# sort(rep(...)) ordering exactly.
hetero1_samp_ind <- parLapply(
  cl,
  as.list(rep(unlist(Ns), each = B)),
  function(sz) sample(x = n, size = sz, replace = FALSE)
)
names(hetero1_samp_ind) <- lapply(hetero1_samp_ind, length)
hetero1_samples <- parLapply(cl, hetero1_samp_ind,
                             function(idx) X2[idx, ])
stopCluster(cl)

## Benchmark coefficients from the full-data fit of X2.
true_coeff <- mylm(X2)$coefficients
print(true_coeff)
## (Intercept) x1 x2 x3 x4
## 7.87143 0.99180 0.99922 2.14490 0.03077
####
#### EMPIRICAL SIZE
#### Null hypothesis is Beta = Beta* (from regression w/ the full data)
### Loop through the coefficients Beta1...Beta4 (j = 1:4)
for (j in 1:4){
  ## p-values for all samples: 1000 each for N = 25, 50, ..., 1000.
  cl <- makeCluster(no_cores, type='FORK')
  print(system.time(
    all_pvals_size <- parLapply(cl, hetero1_samples,
                                function(x) pvals(dat=x, k=j,
                                                  b=true_coeff[j+1]))
  ))
  stopCluster(cl)
  ## One row per sample, one column per covariance estimator.
  pvals_size_mat <- matrix(unlist(all_pvals_size),
                           ncol=length(all_pvals_size[[1]]),
                           byrow=TRUE)
  ## Reject when p < alpha = 0.05. (The original comment said 0.5 — typo.)
  reject_size_mat <- pvals_size_mat < 0.05
  reject_size_mat <- cbind(as.numeric(names(all_pvals_size)), reject_size_mat)
  colnames(reject_size_mat) <- c('n', names(all_pvals_size[[1]]))
  ## Empirical size = rejection rate per sample size, per estimator.
  size_mat <- data.frame(reject_size_mat) %>%
    group_by(n) %>%
    summarise(OLSCM=mean(OLSCM), HC0=mean(HC0), HC1=mean(HC1),
              HC2=mean(HC2), HC3=mean(HC3))
  size_mat <- gather(size_mat, type, reject_rate, c(OLSCM,HC0,HC1,HC2,HC3))
  ## Re-code n as 1..6 so the x axis is evenly spaced; label with real Ns.
  size_mat[,1] <- as.numeric(as.factor(unlist(size_mat[,1])))
  size_plt <- ggplot(size_mat, aes(x=n, y=reject_rate,
                                   colour = type, shape=type))+
    geom_point()+
    geom_line()+
    ggtitle(paste0('Question 4, Part C, Size, Beta', j))+
    theme_bw() +
    scale_x_continuous(breaks=unique(unlist(size_mat[,1])),
                       labels=unlist(Ns))
  print(size_plt)
}
## user system elapsed
## 0.068 0.212 26.465
## user system elapsed
## 0.066 0.189 25.933
## user system elapsed
## 0.066 0.192 26.049
## user system elapsed
## 0.068 0.194 26.144
###
### POWER
### Null hypothesis is Beta = 0
for (j in 1:4){
  ## p-values for all samples: 1000 each for N = 25, 50, ..., 1000.
  cl <- makeCluster(no_cores, type='FORK')
  print(system.time(
    all_pvals_power <- parLapply(cl, hetero1_samples,
                                 function(x) pvals(dat=x, k=j, b=0))
  ))
  stopCluster(cl)
  ## One row per sample, one column per covariance estimator.
  pvals_power_mat <- matrix(unlist(all_pvals_power),
                            ncol=length(all_pvals_power[[1]]),
                            byrow=TRUE)
  ## Reject when p < alpha = 0.05. (The original comment said 0.5 — typo.)
  reject_power_mat <- pvals_power_mat < 0.05
  reject_power_mat <- cbind(as.numeric(names(all_pvals_power)),
                            reject_power_mat)
  colnames(reject_power_mat) <- c('n', names(all_pvals_power[[1]]))
  ## Empirical power = rejection rate per sample size, per estimator.
  ## (Removed a dead apply(...) whose result was computed and discarded.)
  power_mat <- data.frame(reject_power_mat) %>%
    group_by(n) %>%
    summarise(OLSCM=mean(OLSCM), HC0=mean(HC0), HC1=mean(HC1),
              HC2=mean(HC2), HC3=mean(HC3))
  power_mat <- gather(power_mat, type, reject_rate,
                      c(OLSCM,HC0,HC1,HC2,HC3))
  ## Re-code n as 1..6 so the x axis is evenly spaced; label with real Ns.
  power_mat[,1] <- as.numeric(as.factor(unlist(power_mat[,1])))
  power_plt <- ggplot(power_mat, aes(x=n, y=reject_rate,
                                     colour = type, shape=type)) +
    geom_point()+
    geom_line()+
    ggtitle(paste0('Question 4, Part C, Power, Beta', j))+
    theme_bw() +
    scale_x_continuous(breaks=unique(unlist(power_mat[,1])),
                       labels=unlist(Ns))
  print(power_plt)
}
## user system elapsed
## 0.073 0.205 29.918
## user system elapsed
## 0.067 0.193 28.542
## user system elapsed
## 0.068 0.202 26.780
## user system elapsed
## 0.065 0.181 25.997
save.image('./data/q4_c.Rdata')
I think the BP test is better than the others. I conclude that it’s more
worthwhile to use the heteroskedasticity-robust variance estimate when there
is actually evidence of heteroskedasticity.
## Part D: second heteroskedastic design (X3). Same subsampling scheme:
## B index sets per sample size, drawn without replacement.
load('./data/q4_c.Rdata')
cl <- makeCluster(no_cores, type='FORK')
# Ns is ascending, so rep(..., each = B) matches the original
# sort(rep(...)) ordering exactly.
hetero2_samp_ind <- parLapply(
  cl,
  as.list(rep(unlist(Ns), each = B)),
  function(sz) sample(x = n, size = sz, replace = FALSE)
)
names(hetero2_samp_ind) <- lapply(hetero2_samp_ind, length)
hetero2_samples <- parLapply(cl, hetero2_samp_ind,
                             function(idx) X3[idx, ])
stopCluster(cl)

## Benchmark coefficients from the full-data fit of X3.
true_coeff <- mylm(X3)$coefficients
print(true_coeff)
## (Intercept) x1 x2 x3 x4
## 5.1220 0.9944 1.0090 3.7500 1.7219
## BP pretest procedure: run the Breusch-Pagan test first; if it rejects
## (p < 0.05) use the HC0-HC3 robust covariances, otherwise fall back to
## the classical OLS covariance (repeated 4x so output shape matches).
# k = 1; dat = hetero2_samples[[2030]]; b <- true_coeff[k+1]
## Function to get 4 p-values for the two-sided test of H0: beta_k = b.
##
## Args:
##   dat: data.frame with columns x1..x4, y, e (e is dropped before use).
##   k:   index of the slope under test (1..4, i.e. x1..x4).
##   b:   hypothesized value of beta_k.
## Returns: named numeric vector of 4 p-values (bp_HC0..bp_HC3).
pvals_bp <- function(dat, k, b){
  p_samp <- 5
  n_samp <- nrow(dat)
  fit <- mylm(dat)
  beta_k <- fit$coefficients[k+1]
  # NB: TWO SIDED HYPOTHESIS TEST
  my_pt <- function(test_stat){
    pt(test_stat, df = n_samp - p_samp, lower.tail = FALSE)*2
  }
  ## If we reject the null in the BP test, we have heteroskedasticity. In
  ## that case, use the HC covariances; else use the OLS covariance.
  if (bptest(fit)$`p.value` < 0.05){
    # Heteroskedasticity - use HC0..HC3
    ps <- vapply(
      c('HC0','HC1','HC2','HC3'),
      function(tp) my_pt(abs(beta_k - b)/vcovHC(fit, type=tp)[k+1,k+1]^0.5),
      numeric(1)
    )
  } else{
    # Homoskedasticity: use OLS CM but repeat it 4 times
    dat$one <- 1  # explicit intercept column, placed last
    xmat <- as.matrix(select(dat, -c(y,e)))
    ## BUGFIX: all.equal() returns a character vector (not FALSE) on
    ## mismatch, so `!all.equal(...)` raised a type error instead of
    ## stopping; wrap in isTRUE() so the guard actually fires.
    if (!isTRUE(all.equal(colnames(xmat), c("x1","x2","x3","x4","one")))) stop('Knope!')
    xtx_inv <- solve(t(xmat) %*% xmat)
    sigma_hat_sq <- sum(fit$residuals^2)/(n_samp-p_samp)
    mycov <- xtx_inv * sigma_hat_sq
    ## With the intercept last in xmat, the x_k slope is at position k.
    ps <- rep(my_pt(abs(beta_k - b)/(mycov[k,k]^0.5)), 4)
  }
  names(ps) <- c('bp_HC0','bp_HC1','bp_HC2','bp_HC3')
  ps
}
# pvals_bp(hetero2_samples[[2030]], k=1, b=true_coeff[2])
####
#### EMPIRICAL SIZE
#### Null hypothesis is Beta = Beta* (from regression w/ the full data)
### Loop through the coefficients Beta1...Beta4 (j = 1:4)
for (j in 1:4){
  ## p-values for all samples: 1000 each for N = 25, 50, ..., 1000.
  cl <- makeCluster(no_cores, type='FORK')
  print(system.time(
    all_pvals_size <- parLapply(cl, hetero2_samples,
                                function(x) pvals_bp(dat=x, k=j,
                                                     b=true_coeff[j+1]))
  ))
  stopCluster(cl)
  ## One row per sample, one column per pretest estimator.
  pvals_size_mat <- matrix(unlist(all_pvals_size),
                           ncol=length(all_pvals_size[[1]]),
                           byrow=TRUE)
  ## Reject when p < alpha = 0.05. (The original comment said 0.5 — typo.)
  reject_size_mat <- pvals_size_mat < 0.05
  reject_size_mat <- cbind(as.numeric(names(all_pvals_size)), reject_size_mat)
  colnames(reject_size_mat) <- c('n', names(all_pvals_size[[1]]))
  ## Empirical size = rejection rate per sample size, per pretest estimator.
  size_mat <- data.frame(reject_size_mat) %>%
    group_by(n) %>%
    summarise(bp_HC0=mean(bp_HC0), bp_HC1=mean(bp_HC1),
              bp_HC2=mean(bp_HC2), bp_HC3=mean(bp_HC3))
  size_mat <- gather(size_mat, type, reject_rate,
                     c(bp_HC0,bp_HC1,bp_HC2,bp_HC3))
  ## Re-code n as 1..6 so the x axis is evenly spaced; label with real Ns.
  size_mat[,1] <- as.numeric(as.factor(unlist(size_mat[,1])))
  size_plt <- ggplot(size_mat, aes(x=n, y=reject_rate,
                                   colour = type, shape=type))+
    geom_point()+
    geom_line()+
    ggtitle(paste0('Question 4, Part D, Size, Beta', j))+
    theme_bw() +
    scale_x_continuous(breaks=unique(unlist(size_mat[,1])),
                       labels=unlist(Ns))
  print(size_plt)
}
## user system elapsed
## 0.082 0.298 25.187
## user system elapsed
## 0.066 0.186 26.325
## user system elapsed
## 0.069 0.198 25.086
## user system elapsed
## 0.066 0.180 26.790
save.image('./data/q4_d.Rdata')